/*** CREATES CAMPUS x GRADE x YEAR DATASET WITH MOBILITY AND TRACKING MEASURES ***/

* start from an empty session
clear all
* NOTE(review): old-style syntax, presumably equivalent to -set more off- -- confirm
set more 1

* NOTE(review): presumably defines the WORKING, LOGFILES and TRACKDATA path globals used below -- verify in paths.do
do "paths.do"

cd "$WORKING"

* close any log that is still open, then start a fresh log for this program
capture log close
log using "$LOGFILES/txtr_2.log", replace

/*** READ IN ERC TRACKING DATA (camp x grd x year) ***/

** tracking measures by prior test scores; all variable names are forced to lower case
use "$TRACKDATA/tracking_7.dta", clear
describe
rename *, lower

** class size distribution
summarize numcls clssz*

* number of classes, and the student-weighted average class size
rename numcls cgclsnum
rename avgclssz cgclsszavg
label variable cgclsnum "camp x grd calc: number of classes"
label variable cgclsszavg "camp x grd calc: avg class size (stud weighted)"

* the clssz variables are expected in descending order of size, so clssz1 is the largest class
assert clssz1>=clssz2 if clssz2!=.
generate cgclsszmax = clssz1
label variable cgclsszmax "camp x grd calc: max class size"

* smallest class: left missing unless a size is reported for every class
* (a class size may be masked if the class has fewer than 5 students)
egen tmp1 = rownonmiss(clssz*)
egen cgclsszmin = rowmin(clssz*) if (tmp1 == cgclsnum)
label variable cgclsszmin "camp x grd calc: min class size (missing if <5)"

summarize cgclssz*
drop clssz* tmp*


** convert student counts to shares of enrollment, then set shares that are missing to zero
summarize

* NOTE(review): a share is also set to 0 when cgenrl itself is missing or zero -- confirm this is intended
foreach vv of varlist cgnomath cgdup cgmult {
	replace `vv' = `vv'/cgenrl
	replace `vv' = 0 if (`vv'==.)
}

rename schsz cgschsz

** label variables
* student composition of the campus x grade cell
la var cgmale "camp x grd calc: % stud male"
la var cghisp "camp x grd calc: % stud hispanic"
la var cgblack "camp x grd calc: % stud black"
la var cgwhite "camp x grd calc: % stud white"
la var cgasian "camp x grd calc: % stud asian"
la var cgother "camp x grd calc: % stud other race/eth"
la var cgage "camp x grd calc: average stud age (as of Sept)"
la var cgdisadv "camp x grd calc: % stud economically disadvantaged"

* language programs
la var cgbil_2way "camp x grd calc: % stud in bilingual 2-way"
la var cgbil_non2way "camp x grd calc: % stud in bilingual non-2-way"
la var cgesl_con "camp x grd calc: % stud in esl con"
la var cgesl_pull "camp x grd calc: % stud in esl pullout"

la var cggifted "camp x grd calc: % stud in gifted/talented program"

* disability categories
la var cgdis_phy "camp x grd calc: % stud physical disability"
la var cgdis_mal "camp x grd calc: % stud malleable disability"
la var cgdis_spe "camp x grd calc: % stud speech disability"
la var cgdis_oth "camp x grd calc: % stud other disability"

* other programs, course-record measures, enrollment and lagged math scores
la var cgvoced "camp x grd calc: % stud in vocational ed"
la var cgtitle1 "camp x grd calc: % stud Title I"
la var cglep "camp x grd calc: % stud limited English proficient"
la var cgspedr "camp x grd calc: % stud served in restrictive setting"
la var cgcmath "camp x grd calc: % stud w/math course"
la var cgcgeneric "camp x grd calc: % stud w/generic course"
la var cgcoth "camp x grd calc: % stud w/only non-math/generic courses"
la var cgnomath "camp x grd calc: % stud w/no math or generic courses"
la var cgdup "camp x grd calc: % stud w/dupl course records (same Sservice)"
la var cgenrl "camp x grd calc: enrollment, pre-tracking samp"
la var cgschsz "camp x grd calc: enrollment, tracking samp"
la var cgmult "camp x grd calc: % stud w/multiple math courses"
la var cgmisslm "camp x grd calc: % stud missing lag math score, tracking samp"
la var cgmnzlm "camp x grd calc: avg lagged math z-score, tracking samp"
la var cgsdzlm "camp x grd calc: std dev lagged math z-score, tracking samp"
* campus grade-span variables get the ce prefix: clogrd, chigrd, cgrdgrp become celogrd, cehigrd, cegrdgrp
for any logrd higrd grdgrp: rename cX ceX
la var celogrd "camp stud: lowest grade w/enrollment"
la var cehigrd "camp stud: highest grade w/enrollment"
la var cegrdgrp "camp stud: campus grade group"
tab cegrdgrp, m
* make the grade group numeric so the value label can be attached;
* with force, any nonnumeric entry becomes missing
destring cegrdgrp, force replace
 label define schtype 1 "elementary" 2 "middle sch" 3 "junior high" 4 "high school" 5 "elem/secondary"
 label values cegrdgrp schtype
* campus type variables
la var ct_reg "camp type: instructional campus"
la var ct_altrg "camp type: alternative instr unit, regular acct system"
la var ct_altsp "camp type: alternative instr unit, alt acct system"
la var ct_juv "camp type: juvenile justice alt ed pgm"
la var ct_daep "camp type: DAEP only campus"
la var ct_chrtd "camp type: member of open enr chrtr dist w/<75% at risk stud"
la var ct_chrtc "camp type: campus-level charter"
la var ct_chrtu "camp type: member of open enr college chrtr dist"

** save dataset

descr

* campus is made numeric before saving; with force, any nonnumeric value becomes missing
* NOTE(review): presumably so that campus has the same type as in the datasets merged later -- confirm
destring campus, force replace
save "$WORKING/temp/txtr_2_0.dta", replace

** add tracking measures by student demographics
* tracking_8 is a Stata dataset, so it is loaded with -use-
* (-import delimited- reads delimited text files, not .dta files)
use "$TRACKDATA/tracking_8.dta", clear
keep campus grade year schsz poc frpl track*poc track*frpl
descr
rename poc cgpoc
rename frpl cgfrpl
 la var cgpoc "camp x grd calc: % stud nonwhite"
 la var cgfrpl "camp x grd calc: % stud econmically disadvantaged"

* the demographic tracking measures are attached to the math records only
gen subj = "math"

* master = tracking_8 (demographic measures), using = tracking_7 measures saved above
merge 1:1 subj campus grade year using "$WORKING/temp/txtr_2_0.dta", update
tab year _merge
summ schsz if _merge==1
summ schsz if _merge==3
* drop records that appear only in tracking_8
drop if _merge==1
drop _merge schsz




/*** CREATE NEEDED TRACKING MEASURES ***/

/* 
Tracking variables follow this taxonomy:
track_[TARGET]_[EXPRESSION]_[FRAME]
 [TARGET] = {zscore, rscore, poc, frpl}, the student characteristic driving sorting
 [EXPRESSION] = {rho, sigma}, the formula that takes a partition of target values
                as input and gives a single measure as output
 [FRAME] = {fabs, vrel, mrel}, whether the measure is adjusted to reflect the level
           of tracking within the number and size of classes constraint
The variables created below use abbreviated names of the form trk_[TARGET]_[FRAME]:
 [TARGET] is cut to its first four characters (zsco, poc, frpl), [FRAME] is written
 as fa, vr or mr, and only the rho expression is used.
*/

/*
[TARGET] values:
zscore = scaled score on math test, normalized by mean and std dev for the grade-year
poc = 1 if the student has any non-white race or ethnicity status, 0 otherwise
frpl = 1 if the student qualifies for free or reduced-price lunch, 0 otherwise
*/
global TARGET_TYPES zscore poc frpl

/*
[FRAME] notes:
In tracking_7 and tracking_8, we only calculate tracking measures with [FRAME] = fabs,
 that are not adjusted for the number and size of classes. We have the means and standard deviations
 of our simulated tracking measures under random assignment and purposeful assignment, which can be
 used to calculate the measures for [FRAME] = {vrel, mrel}.
*/


foreach tt in $TARGET_TYPES {
	// target name cut to four characters, used in the new variable names (zsco, poc, frpl)
	local tshort2 = substr("`tt'", 1, 4)
	
	/* Create temporary variables */
	
	// expression: short form as it appears in the source variable names, long form for labels
	local eshort = "r"
	local eshort2 = "rho"
	
	if inlist("`tt'", "rscore", "zscore") {
		// use naming convention from tracking_7
		local tshort = substr("`tt'", 1, 1)
		gen tmp_track = `tshort'track`eshort'
		gen tmp_mean = `tshort'track`eshort'avg
		gen tmp_sd = `tshort'track`eshort'sd
		gen tmp_max = `tshort'track`eshort'avg_max
		// gen tmp_msd = `tshort'track`eshort'sd_max
	}
	else {
		// use naming convention from tracking_8
		gen tmp_track = track`eshort'_`tt'
		gen tmp_mean = track`eshort'avg_`tt'
		gen tmp_sd = track`eshort'sd_`tt'
		gen tmp_max = track`eshort'avg_max_`tt'
	}
	
	/*
	Calculate the sorting measures, named trk_[TARGET]_[FRAME]:
	 fa = unadjusted measure
	 vr = distance from the random-assignment mean, in random-assignment standard deviations
	 mr = distance from the random-assignment mean, as a share of the distance
	      between the purposeful-assignment mean and the random-assignment mean
	*/
	gen trk_`tshort2'_fa = tmp_track
	gen trk_`tshort2'_vr = (tmp_track - tmp_mean) / (tmp_sd)
	gen trk_`tshort2'_mr = (tmp_track - tmp_mean) / (tmp_max - tmp_mean)
	
	// labels end with the expression name; the labels previously referenced a local
	// (ee) that is never defined, so the expression name was silently left blank
	la var trk_`tshort2'_fa "camp x grd calc: `tt' tracking, unadjusted (frame-abs) `eshort2'"
	la var trk_`tshort2'_vr "camp x grd calc: `tt' tracking, standardized (var-rel) `eshort2'"
	la var trk_`tshort2'_mr "camp x grd calc: `tt' tracking, relative (max-rel) `eshort2'"
	
	/*
	Keep summaries of permutations under randomized and purposeful assignment,
	named trk_[TARGET]_[ASSIGNMENT][STATISTIC]:
	 rm = random assignment, mean
	 rd = random assignment, sd
	 pm = purposeful assignment, mean
	*/
	gen trk_`tshort2'_rm = tmp_mean
	gen trk_`tshort2'_rd = tmp_sd
	gen trk_`tshort2'_pm = tmp_max
	
	la var trk_`tshort2'_rm "camp x grd calc: `tt' tracking, mean of random assignment `eshort2'"
	la var trk_`tshort2'_rd "camp x grd calc: `tt' tracking, sd of random assignment `eshort2'"
	la var trk_`tshort2'_pm "camp x grd calc: `tt' tracking, mean of purposeful assignment `eshort2'"
	
	// p-value of the standardized measure, based on the t-distribution
	// degrees of freedom are (cgschsz - cgclsnum)
	// this is the upper-tail probability, so negative values of the standardized
	// measure get p-values above 0.5 and count as no tracking
	// NOTE(review): the earlier comment here said "two-sided", but 1 - t(df, x) is one-sided -- confirm intent
	gen trk_`tshort2'_pv = 1 - t(cgschsz - cgclsnum, trk_`tshort2'_vr)
	la var trk_`tshort2'_pv "camp x grd calc: `tt' tracking, p-value from standardized (var-rel) `eshort2'"
	
	
	drop tmp_*
}

summ *_pv
corr *_pv

** drop all tracking variables not renamed and reorganized
drop *trackr*

descr, full
save "$WORKING/temp/txtr_2_1.dta", replace

** keep one record per campus x grade x year, preferring math, then ela, sci, soc
use "$WORKING/temp/txtr_2_1.dta", clear
gen tmp1 = 1 if subj == "math"
replace tmp1 = 2 if subj == "ela"
replace tmp1 = 3 if subj == "sci"
replace tmp1 = 4 if subj == "soc"
sort campus grade year tmp1
by campus grade year: gen tmp2 = _n
keep if tmp2 == 1
* the trk_zsco measures on this file describe math only, so they are blanked
* when the record that was kept is not a math record
* (one pass per measure; the former inner loop over subjects repeated the same replace)
foreach vv in fa vr mr rm rd pm pv {
	replace trk_zsco_`vv' = . if (subj != "math")
}
drop subj tmp*
save "$WORKING/temp/txtr_2_2.dta", replace

** non-math subjects: one record per campus x grade x year, with a separate set of
** test-score tracking measures for each subject
use "$WORKING/temp/txtr_2_1.dta", clear
keep if (subj != "math")
keep campus grade year subj trk_zsco_*
* reshape appends the subject code to each variable name (trk_zsco_fa becomes trk_zsco_faela, and so on)
reshape wide trk_zsco_*, i(campus grade year) j(subj) string
* move the subject code into the target position of the name (trk_zsco_faela becomes trk_ela_fa)
* NOTE(review): assumes ela, sci and soc all occur in subj; otherwise rename stops with an error -- confirm
foreach vv in fa vr mr rm rd pm pv {
	foreach ss in ela sci soc {
		rename trk_zsco_`vv'`ss' trk_`ss'_`vv'
	}
}
save "$WORKING/temp/txtr_2_3.dta", replace

** add the non-math subject measures to the one-record-per-cell file
use "$WORKING/temp/txtr_2_2.dta", clear
merge 1:1 campus grade year using "$WORKING/temp/txtr_2_3.dta"
* every cell in the non-math file must also be present in the master file
assert _merge != 2
drop _merge
save "$WORKING/temp/txtr_2_4.dta", replace

/*** COMBINE WITH PUBLIC CAMPUS x YEAR DATASET ***/

use "$WORKING/txtr_1", clear

** create grade-specific average class size and enrollment measures
descr cc*a
summ cc*a
* use secondary math for grades 7 and 8
rename ccmata ccg07a
gen ccg08a=ccg07a
* class sizes below grade 4 and ccgmea are not used
drop ccgkga ccg01a ccg02a ccg03a ccgmea
* the remaining grade-specific class sizes get the stub name that is reshaped below
* (for example ccg04a becomes cgclssza4)
rename ccg0*a cgclssza*
* grade-specific enrollment, and a campus flag for having any enrollment in the grade
foreach num of numlist 4/8 {
 gen cgenrt`num'=ceg0`num'c
 gen cflhasg`num'=(cgenrt`num'>0 & cgenrt`num'!=.)
 la var cflhasg`num' "camp flag: has enrollment in grade `num'"
}

** reshape to camp x grd x year dataset for grades 4-8
reshape long cgenrt cgclssza, i(campus year) j(grade)
 la var cgenrt "camp x grade: enrollment"
 la var cgclssza "camp x grade: average class size"
 la var grade "grade"
tab year grade, m
* only cells with enrollment of exactly zero are dropped; cells with missing enrollment stay
drop if cgenrt==0
tab year grade, m
* the flags were not reshaped, so each one is constant across the grade records of a campus-year
for num 4/8: summ cflhasg* if cflhasgX==1

** keep 2011+
tab year
drop if year<2011

** merge to tracking measures
* with the update option a matched record is coded _merge 3, 4 or 5:
*  3 = matched, 4 = matched and missing master values filled in from the tracking file,
*  5 = matched with conflicting nonmissing values
* all three mean the cell has tracking information, so they are treated alike below
merge 1:1 campus grade year using "$WORKING/temp/txtr_2_4", update
format campus %12.0f
* cells found only in the tracking file are listed and then dropped
list campus grade year cgenrl ct_* if _merge==2
drop if _merge==2
summ cgenrt if _merge==1, detail
summ cgenrt if inrange(_merge, 3, 5), detail
gen cgflhastrack=inrange(_merge, 3, 5)
summ cgflhastrack
summ cgflhastrack [aw=cgenrt]
tabstat cgflhastrack, by(year)
tabstat cgflhastrack [aw=cgenrt], by(year)
 la var cgflhastrack "camp x grd flag: has tracking information"
drop _merge

** compare related AEIS/TPR variables to those calculated in tracking programs
* enrollment: cgenrt (campus x year file) against cgenrl (tracking file)
summ cgenrt if cgenrl==.
summ cgenrt cgenrl if cgenrl!=.
corr cgenrt cgenrl
tabstat cgenrt cgenrl, by(year)
* average class size: cgclssza (campus x year file) against cgclsszavg (tracking file)
descr cgclsszavg cgclssza
summ cgclsszavg cgclssza
corr cgclsszavg cgclssza
tabstat cgclssza*, by(grade)
* cgclssza is dropped once the comparison has been logged
drop cgclssza


/*** CREATE DISTRICT-LEVEL TRACKING MEASURES ***/

gen cegrlevel = "elementary"
replace cegrlevel = "middle" if inlist(grade, 6, 7, 8)
 la var cegrlevel "camp stud: grade level (elem=up to G5)"

* enrollment-weighted average of each test-score tracking measure within district x grade level
* NOTE(review): the groups are not split by year, so each average pools all years -- confirm intent
foreach ff in fa vr mr {
	// total enrollment over the records that have a non-missing measure
	egen tmp1 = sum(cgenrl) if trk_zsco_`ff'!=., by(distnum cegrlevel)
	// each record's contribution to the weighted average
	gen tmp2 = trk_zsco_`ff' * cgenrl / tmp1
	egen trk_dwavg_zsco_`ff' = sum(tmp2), by(distnum cegrlevel)
	// egen sum() gives 0 when a group has nothing to add up, so a district with no
	// usable measure is set to missing instead of being left with an average of zero
	egen tmp3 = count(tmp2), by(distnum cegrlevel)
	replace trk_dwavg_zsco_`ff' = . if tmp3 == 0
	drop tmp*
}


/*** RESTRICT SAMPLE TO REGULAR INSTRUCTIONAL CAMPUSES WITH TRACKING MEASURES ***/

program define docounts
	// Logs sample-size counts for the data in memory: districts, campuses and total cgenrt,
	// then district-years and campus-years, overall and for each of grades 4-8.
	// Uses cflhasg4-cflhasg8 and Dcflhasg4-Dcflhasg8, which must already exist.
	// Every variable whose name starts with tmp is dropped at the end.

	// tag one observation per district, campus, district-year and campus-year
	egen tmp1d = tag(distnum)
	egen tmp1c = tag(campus)
	egen tmp2d = tag(distnum year)
	egen tmp2c = tag(campus year)
	// total of cgenrt over all observations, stored on every record
	egen double tmp1s = sum(cgenrt)

	display "** number of districts, campuses and students **"
	count if tmp1d==1
	count if tmp1c==1
	list tmp1s in 1

	display "** number of district-years and campus-years **"
	count if tmp2d==1
	count if tmp2c==1

	display "** number of district-years and campus-years w/each grade"
	for num 4/8: count if DcflhasgX==1 & tmp2d==1 \ count if cflhasgX==1 & tmp2c==1

	drop tmp*
end

** counts of districts, campuses and students (grades 4-8)
* district-year versions of the has-grade flags: the D-prefixed flag is the
* maximum of the campus flag over all records in the district-year
foreach vv of varlist cflhasg* {
	egen D`vv' = max(`vv'), by(distnum year)
}

* full sample
docounts

* RESTRICT to those with tracking data
summarize cgenrt if cgflhastrack==0, detail
summarize cgenrt if cgflhastrack==1, detail
drop if cgflhastrack==0
docounts

* EXCLUDE non-regular campuses (records with ct_reg missing are dropped as well)
summarize ct_*
drop if ct_reg!=1
docounts

* flags and campus-type variables that are no longer needed
drop Dcflhasg* cgflhastrack ct_reg ct_altrg ct_altsp ct_juv ct_daep

/*** Create measures of sorting between campuses within a district ***/
/*
Share of the district x grade x year variation in lagged math z-scores that lies
between campuses, rebuilt from campus-level counts, means and standard deviations:
 sum of scores on a campus         = mean times n
 sum of squared scores on a campus = sd^2 times (n - 1), plus mean^2 times n
 lambda  = (district sum)^2 / district n
 iota    = district sum of squared scores
 kappa   = sum over campuses of (campus sum)^2 / campus n
 measure = (kappa - lambda) / (iota - lambda)
NOTE(review): the sum-of-squares line assumes cgsdzlm was computed with an n - 1
 denominator -- confirm against the tracking program.
NOTE(review): egen sum() treats missing campus values as 0, so a campus with students
 but a missing mean or sd still adds to the district n -- confirm this cannot occur.
*/
gen nstud_campus = cgschsz
egen nstud_district = sum(nstud_campus), by(distnum grade year)
gen sum_campus = cgmnzlm * nstud_campus
gen sum2_campus = ((cgsdzlm ^ 2) * (nstud_campus - 1)) + ((cgmnzlm ^ 2) * nstud_campus)
egen sum_district = sum(sum_campus), by(distnum grade year)
egen sum2_district = sum(sum2_campus), by(distnum grade year)
gen lambda_district = (sum_district ^ 2) / nstud_district
gen iota_district = sum2_district
gen kc0 = (sum_campus ^ 2) / nstud_campus
egen kappa_campus = sum(kc0), by(distnum grade year)
gen trk_zsco_camp_fa = (kappa_campus - lambda_district) / (iota_district - lambda_district)
 la var trk_zsco_camp_fa "dist x grd calc: zscore sorting between campuses, unadjusted (frame-abs)"
* only the final measure is kept
drop nstud_* sum* lambda_* iota_* kc0 kappa_*

* save the finished dataset, then close the log opened at the top of this program
save "$WORKING/txtr_2.dta", replace

log close
